Load data, prepared fpkm values and ortholog annonation
datasets = as.data.frame(scan("Stanford_datasets.txt",list(setname="",seqBatch="",species="",tissue=""),sep="\t"))
rawCounts <- as.matrix(read.table('Stanford_datasets_rawCountsMat.txt',header=FALSE,sep='\t'))
geneDetails <- as.data.frame(scan("ortholog_GC_table.txt",skip=1,list(mouse_name="",mouse_GC = 0.0,human_name = "",human_GC=0.0)))
Set columns names
colnames(rawCounts) <- datasets$setname
rownames(rawCounts) <- geneDetails$human_name
rownames(datasets) <- datasets$setname
rownames(geneDetails) <- geneDetails$human_name
Filter out lower 30% and mitochondrial genes
rowSums = apply(rawCounts,1,function(x) sum(x))
quantile(rowSums,probs = 0.3) # result is 2947.9
## 30%
## 2947.9
filter <- apply(rawCounts,1,function(x) sum(x)>2947.9 )
mt <- grep("mt-",geneDetails$mouse_name)
filteredNames <- setdiff(rownames(rawCounts[filter,]),rownames(rawCounts[mt,]))
filteredRawCounts <- rawCounts[filteredNames,]
Normalize data accounting for GC content
GCnormCounts <- filteredRawCounts
GCnormCounts[,1:13] <- withinLaneNormalization(filteredRawCounts[,1:13],geneDetails[filteredNames,"human_GC"],which="loess",round=TRUE)
GCnormCounts[,14:26] <- withinLaneNormalization(filteredRawCounts[,14:26],geneDetails[filteredNames,"mouse_GC"],which="loess",round=TRUE)
Depth normalize using TMM scaling factors
origColSums <- apply(rawCounts,2,function(x) sum(x))
normFactors <- calcNormFactors(GCnormCounts,method='TMM')
colSums = apply(GCnormCounts,2,function(x) sum(x))
normalizedColSums <- origColSums
i <- 1
while (i<length(colSums)){
normalizedColSums[i] <- origColSums[i]* normFactors[i]
i <- i+1
}
meanDepth <- mean(normalizedColSums)
filteredDepthNormCounts <- GCnormCounts
i <- 1
while (i<ncol(filteredDepthNormCounts)){
filteredDepthNormCounts[,i] <- (GCnormCounts[,i]/normalizedColSums[i])*meanDepth
i <- i+1
}
Apply Combat method to correct batch effect
meta <- data.frame(seqBatch = datasets$seqBatch,tissue=datasets$tissue,species=datasets$species)
design <- model.matrix(~1,data=meta)
combat <- ComBat(dat= logTransformedDepthNormCounts,batch=meta$seqBatch,mod=design,par.prior=TRUE)
## Found 190 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found5batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
Apply PCA method to corrected data
transposed_combat <- t(combat)
pca_proc <- prcomp(transposed_combat[,apply(transposed_combat, 2, var, na.rm=TRUE) != 0],scale=TRUE,center=TRUE,retX=TRUE)
Check PCA statistics
summary(pca_proc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 46.1324 34.5088 29.83714 28.62700 26.4953 25.58011
## Proportion of Variance 0.2064 0.1155 0.08636 0.07949 0.0681 0.06347
## Cumulative Proportion 0.2064 0.3220 0.40831 0.48781 0.5559 0.61938
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 21.40856 20.27110 18.77193 18.45646 16.76072 16.33343
## Proportion of Variance 0.04446 0.03986 0.03418 0.03304 0.02725 0.02588
## Cumulative Proportion 0.66384 0.70370 0.73788 0.77092 0.79817 0.82405
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 15.49347 15.16600 14.82063 13.83479 12.83094 12.59261
## Proportion of Variance 0.02329 0.02231 0.02131 0.01857 0.01597 0.01538
## Cumulative Proportion 0.84734 0.86965 0.89095 0.90952 0.92549 0.94087
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 11.63296 11.16385 10.52465 10.46957 8.10270 6.59013
## Proportion of Variance 0.01313 0.01209 0.01074 0.01063 0.00637 0.00421
## Cumulative Proportion 0.95400 0.96609 0.97683 0.98747 0.99383 0.99805
## PC25 PC26
## Standard deviation 4.48659 8.597e-14
## Proportion of Variance 0.00195 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
Transfer PCA data to plots
plotData = datasets[,c("setname","species","tissue")]
plotData$PC1 <- pca_proc$x[,1]
plotData$PC2 <- pca_proc$x[,2]
plotData$PC3 <- pca_proc$x[,3]
Plot the first and the second principal components

Plot the first and the second principal components with centroids

Plot the first, the second and the third principal components
Test for significance of correlations between the matched tissues PC values of human and mouse
cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="pearson")
##
## Pearson's product-moment correlation
##
## data: plotData$PC1[1:13] and plotData$PC1[14:26]
## t = 1.0409, df = 11, p-value = 0.3202
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3012326 0.7299944
## sample estimates:
## cor
## 0.2994546
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="pearson")
##
## Pearson's product-moment correlation
##
## data: plotData$PC2[1:13] and plotData$PC2[14:26]
## t = 5.4008, df = 11, p-value = 0.0002164
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5677182 0.9548235
## sample estimates:
## cor
## 0.8521479
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="pearson")
##
## Pearson's product-moment correlation
##
## data: plotData$PC3[1:13] and plotData$PC3[14:26]
## t = 3.0563, df = 11, p-value = 0.01092
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2021453 0.8946116
## sample estimates:
## cor
## 0.6776541